Goal: can machine learning methods help us to associate metabolites with leaf length?
Previously (script 11b2) I filtered out unnamed metabolites. Here I keep them all.
Also I will PC separately for root and leaf.
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.0-2
library(relaimpo)
Loading required package: MASS
Loading required package: boot
Loading required package: survey
Loading required package: grid
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:boot’:
aml
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
Loading required package: mitools
This is the global version of package relaimpo.
If you are a non-US user, a version with the interesting additional metric pmvd is available
from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3 ✓ purrr 0.3.4
✓ tibble 3.0.4 ✓ dplyr 1.0.2
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.0
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x tidyr::expand() masks Matrix::expand()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x tidyr::pack() masks Matrix::pack()
x dplyr::select() masks MASS::select()
x tidyr::unpack() masks Matrix::unpack()
get leaflength data
leaflength <- read_csv("../../plant/output/leaf_lengths_metabolite.csv") %>%
mutate(pot=str_pad(pot, width=3, pad="0"),
sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
select(sampleID, leaf_avg_std)
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
pot = col_double(),
soil = col_character(),
genotype = col_character(),
trt = col_character(),
leaf_avg = col_double(),
leaf_avg_std = col_double()
)
leaflength %>% arrange(sampleID)
get and wrangle metabolite data
met_raw <-read_csv("../input/metabolites_set1.csv")
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
tissue = col_character(),
soil = col_character(),
genotype = col_character(),
autoclave = col_character(),
time_point = col_character(),
concatenate = col_character()
)
ℹ Use `spec()` for the full column specifications.
met <- met_raw %>%
mutate(pot=str_pad(pot, width = 3, pad = "0")) %>%
mutate(sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
select(sampleID, genotype, tissue, sample_mass = `sample_mass mg`, !submission_number:concatenate) %>%
pivot_longer(!sampleID:sample_mass, names_to = "metabolite", values_to = "met_amount") %>%
#adjust by sample mass
mutate(met_per_mg=met_amount/sample_mass) %>%
#scale and center
group_by(metabolite, genotype, tissue) %>%
mutate(met_per_mg=scale(met_per_mg),
met_amt=scale(met_amount)
) %>%
pivot_wider(id_cols = sampleID,
names_from = c(tissue, metabolite),
values_from = starts_with("met_"),
names_sep = "_")
met
split this into two data frames, one normalized by tissue amount and one not.
met_per_mg <- met %>% select(sampleID, starts_with("met_per_mg")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
met_amt <- met %>% select(sampleID, starts_with("met_amt")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
get leaf data order to match
leaflength <- leaflength[match(met$sampleID, leaflength$sampleID),]
leaflength
met_per_mg.leaf_PCA <- met_per_mg %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized leaf metabolites")
met_per_mg.root_PCA <- met_per_mg %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized root metabolites")
met_amt.leaf_PCA <- met_amt %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw leaf metabolites")
met_amt.root_PCA <- met_amt %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw root metabolites")
are the PCs normalized?
colMeans(met_amt.leaf_PCA$x) %>% round(3) #yes centered
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
PC27 PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36
0 0 0 0 0 0 0 0 0 0
apply(met_amt.leaf_PCA$x, 2, sd) %>% round(2) #not scaled
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22
11.94 8.37 6.74 5.84 5.46 5.32 5.02 4.55 4.38 4.11 3.87 3.80 3.73 3.56 3.44 3.32 3.27 3.17 3.16 3.08 2.94 2.90
PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36
2.86 2.78 2.73 2.61 2.56 2.53 2.40 2.39 2.33 2.26 2.22 2.14 0.00 0.00
combine the leaf and root, and then scale them:
met_per_mg.leaf_PCs <- met_per_mg.leaf_PCA$x
colnames(met_per_mg.leaf_PCs) <- str_c("leaf_", colnames(met_per_mg.leaf_PCs))
met_per_mg.root_PCs <- met_per_mg.root_PCA$x
colnames(met_per_mg.root_PCs) <- str_c("root_", colnames(met_per_mg.root_PCs))
met_per_mg.PCs <- cbind(met_per_mg.leaf_PCs, met_per_mg.root_PCs) %>%
scale()
met_amt.leaf_PCs <- met_amt.leaf_PCA$x
colnames(met_amt.leaf_PCs) <- str_c("leaf_", colnames(met_amt.leaf_PCs))
met_amt.root_PCs <- met_amt.root_PCA$x
colnames(met_amt.root_PCs) <- str_c("root_", colnames(met_amt.root_PCs))
met_amt.PCs <- cbind(met_amt.leaf_PCs, met_amt.root_PCs) %>%
scale()
also combine the rotations
met_per_mg.leaf_rotation <- met_per_mg.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.root_rotation <- met_per_mg.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.PC_rotation <- full_join(met_per_mg.leaf_rotation, met_per_mg.root_rotation, by="metabolite")
met_amt.leaf_rotation <- met_amt.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_amt.root_rotation <- met_amt.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_amt.PC_rotation <- full_join(met_amt.leaf_rotation, met_amt.root_rotation, by="metabolite")
met_per_mg_fit1LOO <- cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, nfolds = nrow(met_per_mg.PCs), alpha=1 )
Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(met_per_mg_fit1LOO)
bestlam=met_per_mg_fit1LOO$lambda.1se
NEXT STEP: Do a K-fold CV, repeat many times and average. Might as well do alpha while we are at it. If we are doing alpha, then we need to manually create our own folds list for each run
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,6))))
system.time (met_per_mg_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
user system elapsed
93.102 4.877 118.425
head(met_per_mg_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_per_mg_multiCV <- met_per_mg_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_per_mg_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_per_mg_summary_cvm <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
`summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_per_mg_summary_cvm
met_per_mg_summary_lambda <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
`summarise()` ungrouping output (override with `.groups` argument)
met_per_mg_summary_lambda
plot it
met_per_mg_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_per_mg_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_per_mg_summary_lambda, color="blue")
So overall these look more reasonable than the LOO plot.
Make a plot of MSE at minimum lambda for each alpha
met_per_mg_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference there, aside from 0.1 and even then, not too much better.
Plot the number of nzero coefficients
met_per_mg_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=36, train_size=30, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
per_mg_fit_test_train <- met_per_mg_summary_lambda %>%
select(alpha, lambda.min.mean)
per_mg_fit_test_train <- met_per_mg_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(per_mg_fit_test_train)
Joining, by = "alpha"
per_mg_fit_test_train <- per_mg_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_per_mg.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_per_mg.PCs)))
[1] 13.36988
[1] 0
[1] 0.7666849
[1] 0.1
[1] 0.8732274
[1] 0.2
[1] 0.6950232
[1] 0.3
[1] 0.6122284
[1] 0.4
[1] 0.5404081
[1] 0.5
[1] 0.4719584
[1] 0.6
[1] 0.4194147
[1] 0.7
[1] 0.3761253
[1] 0.8
[1] 0.3410749
[1] 0.9
[1] 0.3097519
[1] 1
(per_mg_fit_test_train <- per_mg_fit_test_train %>% unnest(tt))
per_mg_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha of 0.8 to 1.0 are very similar and are the best here.
alpha_per_mg <- .8
best_per_mg <- per_mg_fit_test_train %>% filter(alpha == alpha_per_mg)
best_per_mg_fit <- best_per_mg$fit[[1]]
best_per_mg_lambda <- best_per_mg$lambda.min.mean
per_mg_coef.tb <- coef(best_per_mg_fit, s=best_per_mg_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
per_mg_coef.tb %>% filter(beta!=0) %>% arrange(beta)
NA
pred and obs
plot(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]]) #.57
Pearson's product-moment correlation
data: leaflength$leaf_avg_std and best_per_mg$pred_full[[1]]
t = 4.1276, df = 34, p-value = 0.0002243
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3076242 0.7617163
sample estimates:
cor
0.5777675
best_per_mg$full_MSE
[1] 0.7717674
per_mg_vars <- per_mg_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
per_mg_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_per_mg.PCs) %>% as.data.frame() %>% dplyr::select(all_of(per_mg_vars)) %>%
calc.relimp()
per_mg_coef.tb <- per_mg_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_per_mg=V1) %>%
full_join(per_mg_coef.tb) %>%
arrange(desc(PropVar_met_per_mg))
Joining, by = "PC"
per_mg_coef.tb
NA
Checkout the rotations.
met_per_mg_rotation_out <- met_per_mg.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(per_mg_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(per_mg_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="normalized",
metabolite=str_remove(metabolite, "met_per_mg_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_per_mg_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_normalized.csv")
met_per_mg_rotation_out
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,6))))
system.time (met_amt_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_amt.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
user system elapsed
70.687 2.897 87.024
head(met_amt_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_amt_multiCV <- met_amt_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_amt_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_amt_summary_cvm <- met_amt_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
`summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_amt_summary_cvm
met_amt_summary_lambda <- met_amt_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
`summarise()` ungrouping output (override with `.groups` argument)
met_amt_summary_lambda
plot it
met_amt_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_amt_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_amt_summary_lambda, color="blue")
Make a plot of MSE at minimum lambda for each alpha
met_amt_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference here after 0.2
Plot the number of nzero coefficients
met_amt_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=36, train_size=30, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
amt_fit_test_train <- met_amt_summary_lambda %>%
select(alpha, lambda.min.mean)
amt_fit_test_train <- met_amt_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(amt_fit_test_train)
Joining, by = "alpha"
amt_fit_test_train <- amt_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_amt.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_amt.PCs)))
[1] 121.4399
[1] 0
[1] 0.9724185
[1] 0.1
[1] 0.6990522
[1] 0.2
[1] 0.586571
[1] 0.3
[1] 0.5167138
[1] 0.4
[1] 0.444683
[1] 0.5
[1] 0.3928743
[1] 0.6
[1] 0.3464051
[1] 0.7
[1] 0.3173148
[1] 0.8
[1] 0.2852059
[1] 0.9
[1] 0.2623291
[1] 1
(amt_fit_test_train <- amt_fit_test_train %>% unnest(tt))
amt_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha of 0.8 to 1.0 are very similar and are the best here.
alpha_amt <- .8
best_amt <- amt_fit_test_train %>% filter(alpha == alpha_amt)
best_amt_fit <- best_amt$fit[[1]]
best_amt_lambda <- best_amt$lambda.min.mean
amt_coef.tb <- coef(best_amt_fit, s=best_amt_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
amt_coef.tb %>% filter(beta!=0) %>% arrange(beta)
NA
pred and obs
plot(leaflength$leaf_avg_std, best_amt$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_amt$pred_full[[1]]) #.736
Pearson's product-moment correlation
data: leaflength$leaf_avg_std and best_amt$pred_full[[1]]
t = 6.3375, df = 34, p-value = 3.15e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5372665 0.8571965
sample estimates:
cor
0.7359065
best_amt$full_MSE
[1] 0.6064862
amt_vars <- amt_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
amt_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_amt.PCs) %>% as.data.frame() %>% dplyr::select(all_of(amt_vars)) %>%
calc.relimp()
amt_coef.tb <- amt_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_amt=V1) %>%
full_join(amt_coef.tb) %>%
arrange(desc(PropVar_met_amt))
Joining, by = "PC"
amt_coef.tb
NA
Checkout the rotations.
met_amt_rotation_out <- met_amt.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(amt_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(amt_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="raw",
metabolite=str_remove(metabolite, "met_amt_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_amt_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_raw.csv")
met_amt_rotation_out